Normalize reads and create DGE objects
library(tidyverse)
[30m── [1mAttaching packages[22m ─────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.1.1 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtibble [30m 2.1.1 [32m✔[30m [34mdplyr [30m 0.8.0.[31m1[30m
[32m✔[30m [34mtidyr [30m 0.8.3 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0 [39m
[30m── [1mConflicts[22m ────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(edgeR)
Loading required package: limma
files <- dir("../input/Kallisto_output/", include.dirs = TRUE)
files %>% head()
[1] "1a1_q_002_S1_R1_001" "1a2_q_003_S2_R1_001"
[3] "1a3_q_004_S3_R1_001" "1a4_q_005_S4_R1_001"
[5] "1a5_q_006_S5_R1_001" "1a6_q_007_S6_R1_001"
counts.list <- map(files, ~ read_tsv(
file=file.path("..","input","Kallisto_output",.,"abundance.tsv"),
col_types = "cdddd"))
names(counts.list) <- files
counts <- sapply(counts.list, select, est_counts) %>%
bind_cols(counts.list[[1]][,"target_id"],.)
counts[is.na(counts)] <- 0
colnames(counts) <- sub(".est_counts","",colnames(counts),fixed = TRUE)
counts
write_csv(counts,"../output/2018-timecourse_V3.0_raw_counts_.csv.gz")
key <- readxl::read_excel("../input/tube_no_legend_time_course_2018.xlsx",
na=c("","na"),
col_types=c("text", "text", "text", "skip", "text", "skip", "skip", "skip", "text", "text", "text", "skip", "text", "skip", "skip", "date", "date")) %>%
mutate(sampling_time_specific=format(sampling_time_specific, format="%H:%M:%S"))
key
create reformatted tube_no
key <- key %>%
mutate(tube_no_2 = {
tolower(tube_no) %>%
str_replace("q_([1-9](_|$))", "q_00\\1") %>%
str_replace("q_([1-9][0-9](_|$))", "q_0\\1") }) %>%
select(tube_no, tube_no_2, everything())
key
samples <- tibble(
file=files,
tube_no_2 = str_extract(files, pattern = "q_[0-9]{3}(_d8)?")
)
samples
samples <- left_join(samples, key)
Joining, by = "tube_no_2"
samples <- samples %>%
mutate(sampling_day=str_pad(sampling_day,width=2,side = "left",pad="0")) %>%
mutate(group=str_c(genotype, soil_trt, sampling_day, sampling_time,sep="-"))
samples
counts2 <- counts %>%
as.data.frame() %>%
column_to_rownames(var = "target_id") %>%
as.matrix() %>%
round(0)
samples2 <- samples %>%
select(-tube_no_2, -tube_no, -pot, -plant_no, -sampling_day_specific, -sampling_time_specific) %>%
as.data.frame() %>%
column_to_rownames(var="file")
dge <- DGEList(counts=counts2, samples=samples2, group=samples2$group)
dge <- calcNormFactors(dge)
save(dge, file="../output/timecourseDGE.Rdata")